# Load the cleaned gapminder data and render it as an interactive table.
gapminder_clean <- read_csv(file = "gapminder_clean.csv")
gapminder_clean %>%
  datatable()
# Scatter plot of GDP per capita against CO2 emissions for the year 1962.
ggplot(
  data = filter(gapminder_clean, Year == 1962),
  mapping = aes(
    x = `CO2 emissions (metric tons per capita)`,
    y = gdpPercap
  )
) +
  geom_point() +
  theme_classic()
## Warning: Removed 151 rows containing missing values (geom_point).
# Correlation test between GDP per capita and CO2 emissions, using only the
# 1962 observations; the estimate and p-value are printed with cat().
filtered_data <- filter(gapminder_clean, Year == 1962)
correlation <- cor.test(
  filtered_data$gdpPercap,
  filtered_data$`CO2 emissions (metric tons per capita)`
)
cat(
  "The Correlation is: ", correlation[["estimate"]],
  "\n P-VALUE is: ", correlation[["p.value"]]
)
## The Correlation is: 0.9260817
## P-VALUE is: 1.128679e-46
# Find the year in which CO2 emissions and GDP per capita are most strongly
# correlated: compute one correlation estimate per year, then keep the top row.
# slice_max() replaces the earlier sort + `== max()` filter; it keeps ties and
# still returns a row when some year's correlation is NA.
gapminder_clean %>%
  group_by(Year) %>%
  summarise(
    correlation = cor.test(
      `CO2 emissions (metric tons per capita)`,
      gdpPercap
    )$estimate
  ) %>%
  slice_max(correlation, n = 1)
## # A tibble: 1 x 2
## Year correlation
## <dbl> <dbl>
## 1 1967 0.939
# Keep only the 1967 observations for the follow-up plot.
filter_data_1967 <- filter(gapminder_clean, Year == 1967)
The year in which the correlation between CO2 emissions and GDP per capita was highest is 1967.
# Interactive 1967 scatter plot of GDP per capita against CO2 emissions,
# with points coloured by continent and sized by population.
g <- filter_data_1967 %>%
  ggplot(
    aes(
      x = `CO2 emissions (metric tons per capita)`,
      y = gdpPercap,
      color = continent,
      size = pop
    )
  ) +
  geom_point() +
  theme_classic()
ggplotly(g)
# Box plot of energy use per continent, drawn on a log10 y-axis.
# NOTE(review): na.omit() removes every row with an NA in *any* column, not
# only the two columns plotted here -- confirm that this is intended.
ggplot(
  data = na.omit(gapminder_clean),
  mapping = aes(
    x = continent,
    y = `Energy use (kg of oil equivalent per capita)`
  )
) +
  geom_boxplot() +
  scale_y_log10() +
  theme_classic()
Select the `continent` and `Energy use (kg of oil equivalent per capita)` columns.
# Two-column table (continent, energy use) built from complete rows only.
# NOTE(review): na.omit() runs before select(), so rows are dropped for an NA
# in any column of the full data set -- confirm that this is intended.
continent_data <- select(
  na.omit(gapminder_clean),
  continent,
  `Energy use (kg of oil equivalent per capita)`
)
# Mean energy use per continent, listed from lowest to highest.
continent_data %>%
  group_by(continent) %>%
  summarise(
    Mean_Energy_use = mean(
      `Energy use (kg of oil equivalent per capita)`,
      na.rm = TRUE
    )
  ) %>%
  arrange(Mean_Energy_use)
## # A tibble: 5 x 2
## continent Mean_Energy_use
## <chr> <dbl>
## 1 Africa 665.
## 2 Americas 1359.
## 3 Asia 1521.
## 4 Europe 3517.
## 5 Oceania 4506.
To check whether the continents differ in energy use, I will apply an analysis of variance (ANOVA), because I am comparing the means of more than two groups.
# One-way ANOVA of energy use across continents; print the ANOVA table.
continent_data %>%
  aov(
    formula = `Energy use (kg of oil equivalent per capita)` ~ continent,
    data = .
  ) %>%
  summary()
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 4 6.798e+08 169938717 58.32 <2e-16 ***
## Residuals 516 1.504e+09 2914122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD post-hoc test: pairwise differences in mean energy use between
# continents, run on the same one-way ANOVA model.
continent_data %>%
  aov(
    formula = `Energy use (kg of oil equivalent per capita)` ~ continent,
    data = .
  ) %>%
  TukeyHSD()
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = `Energy use (kg of oil equivalent per capita)` ~ continent, data = .)
##
## $continent
## diff lwr upr p adj
## Americas-Africa 693.7245 106.4060 1281.0429 0.0113441
## Asia-Africa 855.6667 276.9503 1434.3831 0.0005699
## Europe-Africa 2852.5003 2272.6061 3432.3945 0.0000000
## Oceania-Africa 3841.4070 2528.8619 5153.9521 0.0000000
## Asia-Americas 161.9422 -432.9740 756.8584 0.9456942
## Europe-Americas 2158.7758 1562.7138 2754.8379 0.0000000
## Oceania-Americas 3147.6825 1827.9146 4467.4504 0.0000000
## Europe-Asia 1996.8336 1409.2456 2584.4216 0.0000000
## Oceania-Asia 2985.7403 1669.7779 4301.7026 0.0000000
## Oceania-Europe 988.9067 -327.5741 2305.3874 0.2409293
Overall, there is a significant difference between continents in energy use, as shown by the ANOVA p-value, which is less than 0.05. Looking at individual pairs of continents, energy use does not differ significantly between Oceania and Europe, nor between the Americas and Asia, as the Tukey HSD results above show.
# European and Asian observations recorded after 1990.
Europe_Asia <- gapminder_clean %>%
  filter(Year > 1990) %>%
  filter(continent %in% c("Europe", "Asia"))
# Box plot comparing imports (% of GDP) between Asia and Europe.
ggplot(
  data = Europe_Asia,
  mapping = aes(
    x = continent,
    y = `Imports of goods and services (% of GDP)`
  )
) +
  geom_boxplot() +
  theme_classic()
## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
# Drop unused factor levels, then compare mean imports (% of GDP) between the
# two continents with a two-sample t-test.
# NOTE(review): the formula mixes `Europe_Asia$...` with `data = Europe_Asia`;
# a bare backticked column name would be more idiomatic. Kept as is so the
# printed "data:" label does not change.
Europe_Asia <- Europe_Asia %>%
  droplevels()
t.test(
  Europe_Asia$`Imports of goods and services (% of GDP)` ~ continent,
  data = Europe_Asia
)
##
## Welch Two Sample t-test
##
## data: Europe_Asia$`Imports of goods and services (% of GDP)` by continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
## -2.321099 12.433240
## sample estimates:
## mean in group Asia mean in group Europe
## 46.84531 41.78924
The difference in imports of goods and services between Asia and Europe is not significant. The p-value (0.1776) is greater than 0.05, so I fail to reject the null hypothesis that there is no difference in means.
# Mean population density per country and year, sorted from most to least
# dense. `.groups = "drop"` returns an ungrouped tibble, so later steps are not
# silently run per country and the summarise() regrouping message is not
# emitted.
average_pop_country_years <- gapminder_clean %>%
  group_by(`Country Name`, Year) %>%
  summarise(
    Average_per_year = mean(
      `Population density (people per sq. km of land area)`
    ),
    .groups = "drop"
  ) %>%
  arrange(desc(Average_per_year))
## `summarise()` has grouped output by 'Country Name'. You can override using the `.groups` argument.
# Interactive line chart of population density over time, one line per
# country. The legend is hidden (it would hold one entry per country).
# The y-axis label no longer carries the stray trailing apostrophe.
g <- ggplot(
  average_pop_country_years,
  aes(x = Year, y = Average_per_year, color = `Country Name`)
) +
  geom_line() +
  ggtitle("Population Density across all years") +
  ylab("Population density (people per sq. km of land area)") +
  xlab("Year") +
  theme_classic() +
  theme(legend.position = "none")
ggplotly(g)
# Change in life expectancy between the first and last year in the data,
# keeping the 10 countries with the largest increase (ties included), sorted
# from largest to smallest gain.
# pivot_wider() and slice_max() replace the superseded spread() and top_n();
# slice_max() already orders its result, so no separate arrange() is needed.
# `pop_dif` holds the life-expectancy difference in years; the column name is
# kept because it is shown in the rendered table.
life_exp_over_years <- gapminder_clean %>%
  select(Year, `Country Name`, `Life expectancy at birth, total (years)`) %>%
  filter(Year %in% c(min(Year), max(Year))) %>%
  pivot_wider(
    names_from = Year,
    values_from = `Life expectancy at birth, total (years)`
  ) %>%
  mutate(pop_dif = `2007` - `1962`) %>%
  slice_max(pop_dif, n = 10)
datatable(life_exp_over_years)
# Interactive plot of the life-expectancy trajectory for the countries listed
# in life_exp_over_years, one coloured line (with points) per country.
top_high_expe_countries <- life_exp_over_years[["Country Name"]]
ggplo <- ggplot(
  data = filter(gapminder_clean, `Country Name` %in% top_high_expe_countries),
  mapping = aes(
    x = Year,
    y = `Life expectancy at birth, total (years)`,
    color = `Country Name`
  )
) +
  geom_line() +
  geom_point()
ggplotly(ggplo)
The plot and the table show that Maldives and Bhutan stand out for their increase in life expectancy since 1962: Maldives gained 37 years, while Bhutan gained 33 years.